library(tidyverse)
library(plotly)
# Read in csv files
ny <- read.csv('new_york_city.csv')
wash <- read.csv('washington.csv')
chi <- read.csv('chicago.csv')
# Displays first rows of each dataframe
head(ny)
## X Start.Time End.Time Trip.Duration
## 1 5688089 2017-06-11 14:55:05 2017-06-11 15:08:21 795
## 2 4096714 2017-05-11 15:30:11 2017-05-11 15:41:43 692
## 3 2173887 2017-03-29 13:26:26 2017-03-29 13:48:31 1325
## 4 3945638 2017-05-08 19:47:18 2017-05-08 19:59:01 703
## 5 6208972 2017-06-21 07:49:16 2017-06-21 07:54:46 329
## 6 1285652 2017-02-22 18:55:24 2017-02-22 19:12:03 998
## Start.Station End.Station User.Type Gender Birth.Year
## 1 Suffolk St & Stanton St W Broadway & Spring St Subscriber Male 1998
## 2 Lexington Ave & E 63 St 1 Ave & E 78 St Subscriber Male 1981
## 3 1 Pl & Clinton St Henry St & Degraw St Subscriber Male 1987
## 4 Barrow St & Hudson St W 20 St & 8 Ave Subscriber Female 1986
## 5 1 Ave & E 44 St E 53 St & 3 Ave Subscriber Male 1992
## 6 State St & Smith St Bond St & Fulton St Subscriber Male 1986
head(wash)
## X Start.Time End.Time Trip.Duration
## 1 1621326 2017-06-21 08:36:34 2017-06-21 08:44:43 489.066
## 2 482740 2017-03-11 10:40:00 2017-03-11 10:46:00 402.549
## 3 1330037 2017-05-30 01:02:59 2017-05-30 01:13:37 637.251
## 4 665458 2017-04-02 07:48:35 2017-04-02 08:19:03 1827.341
## 5 1481135 2017-06-10 08:36:28 2017-06-10 09:02:17 1549.427
## 6 1148202 2017-05-14 07:18:18 2017-05-14 07:24:56 398.000
## Start.Station
## 1 14th & Belmont St NW
## 2 Yuma St & Tenley Circle NW
## 3 17th St & Massachusetts Ave NW
## 4 Constitution Ave & 2nd St NW/DOL
## 5 Henry Bacon Dr & Lincoln Memorial Circle NW
## 6 1st & K St SE
## End.Station User.Type
## 1 15th & K St NW Subscriber
## 2 Connecticut Ave & Yuma St NW Subscriber
## 3 5th & K St NW Subscriber
## 4 M St & Pennsylvania Ave NW Customer
## 5 Maine Ave & 7th St SW Subscriber
## 6 Eastern Market Metro / Pennsylvania Ave & 7th St SE Subscriber
head(chi)
## X Start.Time End.Time Trip.Duration
## 1 1423854 2017-06-23 15:09:32 2017-06-23 15:14:53 321
## 2 955915 2017-05-25 18:19:03 2017-05-25 18:45:53 1610
## 3 9031 2017-01-04 08:27:49 2017-01-04 08:34:45 416
## 4 304487 2017-03-06 13:49:38 2017-03-06 13:55:28 350
## 5 45207 2017-01-17 14:53:07 2017-01-17 15:02:01 534
## 6 1473887 2017-06-26 09:01:20 2017-06-26 09:11:06 586
## Start.Station End.Station User.Type Gender
## 1 Wood St & Hubbard St Damen Ave & Chicago Ave Subscriber Male
## 2 Theater on the Lake Sheffield Ave & Waveland Ave Subscriber Female
## 3 May St & Taylor St Wood St & Taylor St Subscriber Male
## 4 Christiana Ave & Lawrence Ave St. Louis Ave & Balmoral Ave Subscriber Male
## 5 Clark St & Randolph St Desplaines St & Jackson Blvd Subscriber Male
## 6 Clinton St & Washington Blvd Canal St & Taylor St Subscriber Male
## Birth.Year
## 1 1992
## 2 1992
## 3 1981
## 4 1986
## 5 1975
## 6 1990
# Data manipulation
calculate_age <- function(df, birth_year, start_time) {
#' Calculates user age by subtracting birth year from start year
# Converts Start.Time to Date datatype
df[[start_time]] <- as.Date(df[[start_time]])
# Extracts year from Start.Time
start_year <- year(df[[start_time]])
# Calculates difference in years
age_difference <- start_year - df[[birth_year]]
# Returns age
return(age_difference)
}
# Creating new column for age and applying function
ny$age <- calculate_age(ny, "Birth.Year", "Start.Time")
chi$age <- calculate_age(chi, "Birth.Year", "Start.Time")
# Creating new column for city name
ny$city <- "New York"
chi$city <- "Chicago"
wash$city <- "Washington"
# Combining ny and chi dataframes into one dataframe
ny_chi <- rbind(ny, chi)
# Removing dissimilar columns compared to wash
ny_trunc <- ny %>%
select(-one_of("Gender", "Birth.Year", "age"))
chi_trunc <- chi %>%
select(-one_of("Gender", "Birth.Year", "age"))
# Combining ny, chi, and wash dataframes into one dataframe
all_city <- rbind(ny_trunc, chi_trunc, wash)
subscriber_age <- ny_chi %>%
# Filters for Subscribers, non-null age, and non-null gender
filter(User.Type == "Subscriber" & !(is.na(age)) & Gender=="Male" | Gender=="Female") %>%
group_by(city, Gender, age) %>%
summarise(subscriber_count = n())
# Finds highest subscriber count for females in New York
max_f_ny <- subscriber_age %>%
filter(city == "New York", Gender == "Female") %>%
arrange(desc(subscriber_count)) %>%
slice(1)
max_f_ny
## # A tibble: 1 × 4
## # Groups: city, Gender [1]
## city Gender age subscriber_count
## <chr> <chr> <dbl> <int>
## 1 New York Female 28 574
# Finds highest subscriber count for males in New York
max_m_ny <- subscriber_age %>%
filter(city == "New York", Gender == "Male") %>%
arrange(desc(subscriber_count)) %>%
slice(1)
max_m_ny
## # A tibble: 1 × 4
## # Groups: city, Gender [1]
## city Gender age subscriber_count
## <chr> <chr> <dbl> <int>
## 1 New York Male 32 1486
# Finds highest subscriber count for females in Chicago
max_f_chi <- subscriber_age %>%
filter(city == "Chicago", Gender == "Female") %>%
arrange(desc(subscriber_count)) %>%
slice(1)
max_f_chi
## # A tibble: 1 × 4
## # Groups: city, Gender [1]
## city Gender age subscriber_count
## <chr> <chr> <dbl> <int>
## 1 Chicago Female 25 114
# Finds highest subscriber count for males in Chicago
max_m_chi <- subscriber_age %>%
filter(city == "Chicago", Gender == "Male") %>%
arrange(desc(subscriber_count)) %>%
slice(1)
max_m_chi
## # A tibble: 1 × 4
## # Groups: city, Gender [1]
## city Gender age subscriber_count
## <chr> <chr> <dbl> <int>
## 1 Chicago Male 28 287
# Finds count for females and males in each city
f_m_count <- ny_chi %>%
filter(User.Type == "Subscriber" & !(is.na(age)) & Gender=="Male" | Gender=="Female") %>%
group_by(city, Gender) %>%
summarise(count = n())
f_m_count
## # A tibble: 4 × 3
## # Groups: city [2]
## city Gender count
## <chr> <chr> <int>
## 1 Chicago Female 1723
## 2 Chicago Male 5159
## 3 New York Female 12159
## 4 New York Male 36625
graph1 <- ny_chi %>%
# Filters for Subscribers, non-null age, and non-null gender
filter(User.Type == "Subscriber" & !(is.na(age)) & Gender=="Male" | Gender=="Female") %>%
# Creates histogram with labels
ggplot(aes(x = age)) +
geom_histogram(binwidth=5, color="black", fill="lightblue") +
labs(title="Distribution of Age by Gender in New York vs Chicago", x="Age", y="Count") +
# facets by gender and city
facet_grid(city~Gender) +
theme_minimal() +
scale_x_continuous(breaks = seq(0, 100, by=10))
# Converts to a plotly graph for tooltip functionality
ggplotly(graph1)
The histograms display the distribution of age faceted by gender for New York and Chicago. While the counts for Chicago are lower than the counts for New York, it can be seen that for both cities, there are more men than women riders. In New York, there are 36,625 men compared to 12,159 women. In Chicago, there are 5,159 men and 1,723 women. Both cities’ histograms are also skewed right for both men and women. It appears that most riders are between people in their early twenties to mid-thirties, regardless of city. The most frequent age for female subscribers in New York is 28 while it is 32 for male subscribers. The most frequent age for female subscribers in Chicago is 25 while it is 28 for males in Chicago.
# Creates new column for converting trip duration from seconds to minutes
all_city$Trip.Duration.Mins <- all_city$Trip.Duration / 60
avg_trip <- all_city %>%
# Filters for non-null user types
filter(User.Type == "Subscriber" | User.Type == "Customer") %>%
group_by(city, User.Type) %>%
# Calculating average trip duration by city and user type
summarise(Average = mean(Trip.Duration.Mins, na.rm=T))
avg_trip
## # A tibble: 6 × 3
## # Groups: city [3]
## city User.Type Average
## <chr> <chr> <dbl>
## 1 Chicago Customer 32.2
## 2 Chicago Subscriber 11.4
## 3 New York Customer 36.6
## 4 New York Subscriber 12.6
## 5 Washington Customer 43.9
## 6 Washington Subscriber 12.2
graph2 <- avg_trip %>%
# Creates grouped bar chart with labels
ggplot(aes(x=city, y=Average, fill=User.Type)) +
geom_col(position = "dodge") +
labs(title="Average Trip Duration by User Type by City",
x="City",
y="Trip Duration (Minutes)",
fill="User Type") +
theme_minimal() +
scale_fill_brewer(palette="Paired")
# Converts to a plotly graph for tooltip functionality
ggplotly(graph2)
The grouped bar chart shows that across all cities in the data, customers take longer average trips than subscribers. Customers in Washington seem to take the longest trips on average, followed by New York, and then Chicago at 43.9 minutes, 36.6 minutes, and 32.2 minutes, respectively. The subscribers across all cities take similar trip duration on average at 11.4 minutes for Chicago, 12.6 minutes for New York and 12.2 minutes for Washington. This could be due to one-time customers tending to be tourists and subscribers tending to be locals using bike sharing for commuting to work, however this cannot be proven or denied based on the dataset.
extract_hour_day <- function(df, start_time) {
#' Extracts the hour and day of the week from Start.Time and order the days of the week
# Convert Start.Time to POSIXct datetime format
df[[start_time]] <- as.POSIXct(df[[start_time]], format = "%Y-%m-%d %H:%M:%S")
# Extract hour and day of the week from Start.Time
df$hour <- format(df[[start_time]], "%H")
df$day_of_week <- format(df[[start_time]], "%A")
# Order days of the week
df$day_of_week <- factor(df$day_of_week, levels=c("Sunday", "Saturday", "Friday", "Thursday", "Wednesday",
"Tuesday", "Monday"))
return(df)
}
usage_df <- function(df) {
#' Filters, groups by day of the week and hour, and finds trip count for the df
# Filters and summarises dataframe
usage_detail <- df %>%
filter(!(is.na(day_of_week)) & !(is.na(hour))) %>%
group_by(day_of_week, hour) %>%
summarise(trip_count = n(), .groups = 'drop')
return(usage_detail)
}
plot_heatmap <- function(df) {
#' Plots the usage detail df in a heatmap
plot <- ggplot(df, aes(x = hour, y = day_of_week, fill = trip_count)) +
# Creates heatmap with labels
geom_tile() +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
theme_minimal() +
labs(title = "Bike Usage by Hour and Day of the Week",
x = "Hour of the Day",
y = "Day of the Week",
fill = "Trip Count")
return(plot)
}
# Extracts hour and day of week from New York dataframe
hour_day_ny <- extract_hour_day(ny, "Start.Time")
# Returns usage dataframe
detail_ny <- usage_df(hour_day_ny)
# Finds top 5 highest usage times
usage_summary <- detail_ny %>%
arrange(desc(trip_count)) %>%
slice(c(1, n()))
# Prints top 5 usage times
usage_summary
## # A tibble: 2 × 3
## day_of_week hour trip_count
## <fct> <chr> <int>
## 1 Wednesday 17 1069
## 2 Tuesday 03 4
# Returns heatmap for New York dataframe
plot_heatmap(detail_ny)
# Extracts hour and day of week from Chicago dataframe
hour_day_chi <- extract_hour_day(chi, "Start.Time")
# Returns usage dataframe
detail_chi <- usage_df(hour_day_chi)
# Finds top 5 highest usage times
usage_summary2 <- detail_chi %>%
arrange(desc(trip_count)) %>%
slice(c(1, n()))
# Prints top 5 usage times
usage_summary2
## # A tibble: 2 × 3
## day_of_week hour trip_count
## <fct> <chr> <int>
## 1 Thursday 17 208
## 2 Wednesday 02 1
# Returns heatmap for Chicago dataframe
plot_heatmap(detail_chi)
# Extracts hour and day of week from Washington dataframe
hour_day_wash <- extract_hour_day(wash, "Start.Time")
# Returns usage dataframe
detail_wash <- usage_df(hour_day_wash)
# Finds top 5 highest usage times
usage_summary3 <- detail_wash %>%
arrange(desc(trip_count)) %>%
slice(c(1, n()))
# Prints top 5 usage times
usage_summary3
## # A tibble: 2 × 3
## day_of_week hour trip_count
## <fct> <chr> <int>
## 1 Wednesday 08 1962
## 2 Thursday 00 19
# Returns heatmap for Washington dataframe
plot_heatmap(detail_wash)
The heatmaps show different levels of usage throughout each hour of the day by day of the week for each city. The heatmap for New York shows that the days and times with the heaviest usage are on weekdays between 7am - 9am and 5pm - 7pm. Weekends during the day also seem to be popular in New York. The day and time with the highest usage in New York is Wednesday at 5pm while the lowest usage is Tuesday at 3am.
This pattern is also mirrored pretty similarly in Chicago where the heaviest usage times are weekdays from around 7am - 8am and around 4pm - 6pm. Weekends during the day also seem to be popular. The day and time with the highest usage in Chicago is Thursday at 5pm and the lowest usage is Wednesday at 2 am. The weekday pattern could be due to professionals utilizing bike sharing to commute to and from work. The weekend patterns could be due to riders using the bikes for leisure.
Interestingly, in Washington D.C the heaviest usage times are on weekdays from around 5am - 8am, however this is not mirrored in the evening as expected. For weekends, it seems that the late night/ early morning time is the most popular. The day and time with the highest usage is Wednesday at 8am and the lowest usage is Thursday at 12am.